A (very brief) summary of the objective / problem:
WGCNA package to find gene modulescalculateQCMetrics using ERCC and MT genes as feature controls and bulk and empty wells as cell controlsplot, plotQC: highest expression in single cells, bulks and empty wellsplotPhenoData: look at total counts and total features, colour/size by other cell metadataplotPCA: with and without cell QC filteringWhat can you learn about this dataset?
load("jk_anon_data.RData")
sce <- newSCESet(countData = counts(scData), phenoData = scData@phenoData,
featureData = scData@featureData)
exprs(sce) <- exprs(scData)
tpm(sce) <- tpm(scData)
norm_exprs(sce) <- norm_exprs(scData)plot(sce, colour_by = "description", nfeatures = 1000)#Two sets of feature controls: ERCC spike ins & mitochondrial genes
fctrls <- featureNames(sce)[grepl("ERCC-", featureNames(sce))]
fctrls2 <- featureNames(sce)[grepl("MT", fData(sce)$chromosome_name)]
#Cell controls are bulks and empty wells
cctrls <- cellNames(sce)[pData(sce)$description == "bulk"]
cctrls2 <- cellNames(sce)[pData(sce)$description == "empty"]
#Set threshold of 80% of reads from feature controls to flag cell
sce <- calculateQCMetrics(
sce,
feature_controls = list(ERCC_ctrl = fctrls, mt_ctrl = fctrls2),
cell_controls = list(bulks = cctrls, empty = cctrls2),
nmads = 8, pct_feature_controls_threshold = 80)plotQC(sce, type = "highest-expression", exprs_values = "counts", n = 20) +
theme(axis.text.y = element_text(size = 10),
axis.title = element_text(size = 11))p2 <- plotQC(sce[, !sce$is_cell_control],
type = "highest-expression")
p2 + ggtitle(paste0("Single Cells\n",p2$labels$title))p3 <- plotQC(sce[, sce$is_cell_control_bulks],
type = "highest-expression")
p3 + ggtitle(paste0("Bulks\n",p3$labels$title))p4 <- plotQC(sce[, sce$is_cell_control_empty],
type = "highest-expression")
p4 + ggtitle(paste0("Empty wells\n",p4$labels$title))plotQC(sce, type = "exprs-freq-vs-mean", feature_controls = fctrls)levels(sce$description) <- c(levels(sce$description), "failed")
plotPhenoData(sce, aes(x = total_counts, y = total_features,
colour = description))plotPhenoData(sce,
aes(x = pct_tpm_top_200_features, y = total_features,
colour = description, size = CDNA)) +
geom_hline(aes(yintercept = 2500)) + geom_vline(aes(xintercept = 90)) +
ggtitle("Epithelial cell QC")sce$description[
sce$description == "cell" &
sce$total_features < 2500 &
sce$pct_tpm_top_200_features > 75] <- "failed"
knitr::kable(as.data.frame(table(sce$description)))| Var1 | Freq |
|---|---|
| bulk | 2 |
| cell | 18 |
| empty | 2 |
| failed | 1 |
plotPCA(sce, size_by = "CDNA", colour_by = "description")plotPCA(filter(sce, description == "cell"), size_by = "CDNA",
colour_by = "source")Make PCA and t-SNE plots, with points coloured/sized by known marker genes: ITLN1, MUC2, MK167, MCM4, CEACAM1, and SLC26A3
plotPCA(sce[, sce$description == "cell"],
size_by = "ITLN1", colour_by = "MUC2") + ggtitle("goblet cell")plotTSNE(sce[, sce$description == "cell"], size_by = "ITLN1",
colour_by = "MUC2", perplexity = 5, rand_seed = 5) + ggtitle("goblet cell")plotPCA(sce[, sce$description == "cell"],
size_by = "MKI67", colour_by = "MCM4") + ggtitle("transit amplifying")plotTSNE(sce[, sce$description == "cell"], size_by = "MKI67",
colour_by = "MCM4", perplexity = 5, rand_seed = 5)plotPCA(sce[, sce$description == "cell"],
size_by = "SLC26A3", colour_by = "CEACAM1") + ggtitle("absorptive enterocyte")plotTSNE(sce[, sce$description == "cell"], size_by = "SLC26A3",
colour_by = "CEACAM1", perplexity = 5, rand_seed = 5)Requires the following packages: WGCNA (gene correlation analysis) and gplots (heatmaps)
##WGCNA analysis##
##################
#source("http://bioconductor.org/biocLite.R")
#biocLite(c("AnnotationDbi", "impute", "GO.db", "preprocessCore"))
#install.packages('WGCNA')
#install.packages("gplots")
suppressPackageStartupMessages(library("WGCNA"))## ==========================================================================
## *
## * Package WGCNA 1.51 loaded.
## *
## * Important note: It appears that your system supports multi-threading,
## * but it is not enabled within WGCNA in R.
## * To allow multi-threading within WGCNA with all available cores, use
## *
## * allowWGCNAThreads()
## *
## * within R. Use disableWGCNAThreads() to disable threading if necessary.
## * Alternatively, set the following environment variable on your system:
## *
## * ALLOW_WGCNA_THREADS=<number_of_processors>
## *
## * for example
## *
## * ALLOW_WGCNA_THREADS=4
## *
## * To set the environment variable in linux bash shell, type
## *
## * export ALLOW_WGCNA_THREADS=4
## *
## * before running R. Other operating systems or shells will
## * have a similar command to achieve the same aim.
## *
## ==========================================================================
suppressPackageStartupMessages(library("gplots"))WGCNAExtract normalised expression data for cells passing QC
myData <- norm_exprs(sce)[
!apply(norm_exprs(sce)[, sce$description == "cell"], 1, anyNA),
sce$description == "cell"]Transpose the dataset - need samples as rows, features as columns
datExpr = as.data.frame(t(myData))adjacency <- adjacency(datExpr, power = 12, type = "signed hybrid",
corFnc = "bicor",
corOptions = "use = 'p', maxPOutliers = 0.05")## Warning in bicor(datExpr, use = "p", maxPOutliers = 0.05): bicor: zero MAD
## in variable 'x'. Pearson correlation was used for individual columns with
## zero (or missing) MAD.
Convert to dissimilarity matrix
diss <- 1 - adjacencyCall the hierarchical clustering function
geneTree <- hclust(as.dist(diss), method = "average")Module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(
dendro = geneTree, distM = diss, deepSplit = 2, pamStage = FALSE,
pamRespectsDendro = FALSE, minClusterSize = 30, cutHeight = 0.99)## ..done.
table(dynamicMods)## dynamicMods
## 0 1 2 3 4 5 6 7 8 9 10
## 5490 197 103 99 80 55 48 39 38 35 33
names(dynamicMods) <- colnames(datExpr)
# Convert numeric lables into colors
dynamicColors <- labels2colors(dynamicMods)
table(dynamicColors)## dynamicColors
## black blue brown green grey magenta pink
## 39 103 99 55 5490 35 38
## purple red turquoise yellow
## 33 48 197 80
plotDendroAndColors(
geneTree, dynamicColors, "Dynamic Tree Cut", dendroLabels = FALSE,
hang = 0.03, addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")MEList <- moduleEigengenes(datExpr, colors = dynamicMods)
MEs <- MEList$eigengenes
# Rename to moduleColors
moduleColors <- dynamicColors
# Construct numerical labels corresponding to the colors
colorOrder <- c("grey", standardColors(100))
moduleLabels <- match(moduleColors, colorOrder) - 1
# Recalculate MEs with color labels
MEs0 <- moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs <- orderMEs(MEs0)
# names (colors) of the modules
modNames = substring(names(MEs), 3)myclust <- pData(sce)[sce$description == "cell", "subset"]
datTraits <- data.frame(
goblet = as.numeric(myclust == " goblet"),
ta = as.numeric(myclust == " ta"),
enterocyte = as.numeric(myclust == " enterocyte")
)
nSamples <- nrow(datExpr);
moduleTraitCor <- cor(removeGreyME(MEs), datTraits, use = "p")
moduleTraitPvalue <- corPvalueStudent(moduleTraitCor, nSamples)
# Will display correlations and their p-values
textMatrix <- paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) <- dim(moduleTraitCor)
#sizeGrWindow(10,5)par(mar = c(4, 8.5, 3, 3))
labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits),
yLabels = names(MEs), ySymbols = names(MEs),
colorLabels = FALSE, colors = blueWhiteRed(50),
textMatrix = textMatrix, setStdMargins = FALSE,
cex.text = 1, zlim = c(-1,1), main = paste("Module-trait relationships"))geneModuleMembership <- as.data.frame(cor(datExpr, MEs, use = "p"))
myModGenes <- character()
for (i in c(2,3,6)) {
myModule <- geneModuleMembership[moduleColors == modNames[i], i]
names(myModule) <- row.names(geneModuleMembership)[moduleColors == modNames[i]]
myModule <- myModule[order(myModule, decreasing = TRUE)]
myModGenes <- c(myModGenes, head(names(myModule), 15))
}
myModExprs <- datExpr[,myModGenes]
myModExprs <- t(as.matrix(myModExprs))
hc <- hclust(as.dist(1 - cor(myModExprs, method = "pearson")),
method = "average")heatmap.2(myModExprs, Rowv = NULL, Colv = as.dendrogram(hc),
labCol = rownames(datExpr), col = redgreen(75), scale = "none",
density.info = "none", trace = "none", margins = c(5, 5),
cexRow = 0.5, cexCol = 0.5, dendrogram = "col")But red-green colour schemes are bad (especially for red-green colour-blind people, who make up around 10% of the male population). So don’t use them.
Better…
heatmap.2(myModExprs, Rowv = NULL, Colv = as.dendrogram(hc),
labCol = rownames(datExpr),
col = blueWhiteRed(75), scale = "none",
density.info = "none", trace = "none", margins = c(5, 5),
cexRow = 0.5, cexCol = 0.5, dendrogram = "col")scater coauthors: Quin Wills, Kieran Campbell, Aaron Lun